/*
 Copyright 2013--Present JMM_PROGNAME
 
 This file is distributed under the terms of the JMM_PROGNAME License.
 
 You should have received a copy of the JMM_PROGNAME License.
 If not, see <JMM_PROGNAME WEBSITE>.
*/
// CREATED    : 8/26/2015
// LAST UPDATE: 9/30/2015

#include "statsxx/machine_learning/neural_network/deep_belief_network/DBN.hpp"

// STL
//#include <iostream> // std::cout, std::endl
#include <vector>   // std::vector<>

/*
// jNeuralNet
#include "datasets.hpp" // DataSet, partition_data_set()
*/
 
// stats++
#include "statsxx/machine_learning/restricted_Boltzmann_machine/RBM.hpp"  // RBM


inline neural_network::DBN::DBN(const std::vector<int> &arch)
{
    // Given a user-supplied architecture, record the architectural details ...
    // architecture[] =
    //     ni     (input nodes)
    //     nh1
    //     nh2
    //     .
    //     .
    //     .
    //     nhl    (hidden-layer nodes)
    //     no     (output nodes)
    
    this->architecture = arch;
    
    // next, setup each RBM ...
    // note: an RBM is not set up containing the output nodes as hidden nodes
    //
    // Compute the RBM count as a SIGNED value first: architecture.size() is
    // unsigned, so size()-2 would wrap around for a degenerate architecture
    // (fewer than 2 layers) and the original signed/unsigned loop comparison
    // would then iterate (nearly) forever. With a signed count, a degenerate
    // architecture simply yields zero RBMs.
    const int n_RBMs = static_cast<int>(this->architecture.size()) - 2;
    
    for(int i = 0; i < n_RBMs; ++i)
    {
        // note: the architecture of each hidden RBM is specified by (this->architecture[i], this->architecture[i+1])
        RBM.push_back(restricted_Boltzmann_machine::RBM(this->architecture[i], this->architecture[i+1]));
    }
    
    // finally, setup the MLP that will share weights with the stacked RBMs ...
    // << jmm: this is done during the finetune step >>
}

// Defaulted: no manual cleanup is performed here, so let the compiler generate
// the destructor (Rule of Zero). This also removes the stray ';' that followed
// the old empty body (an empty declaration at namespace scope).
inline neural_network::DBN::~DBN() = default;







/*
//========================================================================
//========================================================================
//
// NAME: template<typename T>
//       void neural_network::DBN_T<T>::pretrain(const int k,
//                                               const Matrix<T> &training_data,
//                                               const double lr,
//                                               const int max_epochs,
//                                               const int epoch_out)
//
// DESC: (Pre-)trains the DBN, using greedy layer-by-layer training.
//
// INPUT:
//     int         k           : number of sampling steps (must be at least 1)
//     Matrix<T> training_data : data matrix with training data
//     double      lr          : learning rate
//     int         max_epochs  : maximum number of epochs
//     int         epoch_out   : number of epochs between output
//
// OUTPUT: None ... the internal states of the RBMs are modified.
//
// NOTES:
//     - The settings above are identical to training a single RBM, and are used uniformly for each RBM.
//
//========================================================================
//========================================================================
template<typename T>
void neural_network::DBN_T<T>::pretrain(const int k,
                                        const Matrix<T> &training_data,
                                        const double lr,
                                        const int max_epochs,
                                        const int epoch_out)
{
    // note: after each layer of training, training data for the next RBM is transformed via sampling 
    Matrix<int> sampled_data(training_data.size(0), this->architecture[1]);
    
    // TRAIN INITIAL RBM (RBM_0) ...
    std::cout << "training initial RBM (RBM_0) ..." << '\n';
    
    if(dynamic_cast<Restricted_Boltzmann_machine::RBM*>(this->RBM_0.get()))
    {
        // convert training data ...
        // note: for the initial RBM (RBM_0), we must convert the training data into the appropriate type (int for a RBM) -- because, for example, a CDBN doesn't know at compile time that it won't enter this subroutine, which you can't call with Matrix<double> training data  
        Matrix<int> training_data_int(training_data.size(0), training_data.size(1));
        
        for(auto i = 0; i < training_data.size(0); ++i)
        {
            for(auto j = 0; j < training_data.size(1); ++j)
            {
                training_data_int(i,j) = static_cast<int>(training_data(i,j));
            }
        }
        
        // train the RBM ...
        dynamic_cast<Restricted_Boltzmann_machine::RBM*>(this->RBM_0.get())->train(k, training_data_int, lr, max_epochs, epoch_out);
        
        // transform the training data via sampling ...
        // note: the following code is basically repeated for the DBNs and the CDBNs (below), but avoids a dynamic_cast<> at every training data point
        for(auto j = 0; j < training_data.size(0); ++j)
        {
            Vector<double> h_mean;
            Vector<int> h_sample;
            std::tie(h_mean, h_sample) = dynamic_cast<Restricted_Boltzmann_machine::RBM*>(this->RBM_0.get())->sample_h_given_v(training_data_int.row(j));
            
            for(int l = 0; l < this->architecture[1]; ++l)
            {
                sampled_data(j,l) = h_sample(l);
            }
        }
    }
    else if(dynamic_cast<Restricted_Boltzmann_machine::CRBM*>(this->RBM_0.get()))
    {
        // see the notes above
        
        Matrix<double> training_data_dbl(training_data.size(0), training_data.size(1));
        
        for(auto i = 0; i < training_data.size(0); ++i)
        {
            for(auto j = 0; j < training_data.size(1); ++j)
            {
                training_data_dbl(i,j) = static_cast<double>(training_data(i,j));
            }
        }

        dynamic_cast<Restricted_Boltzmann_machine::CRBM*>(this->RBM_0.get())->train(k, training_data_dbl, lr, max_epochs, epoch_out);
        
        for(auto j = 0; j < training_data.size(0); ++j)
        {
            Vector<double> h_mean;
            Vector<int> h_sample;
            std::tie(h_mean, h_sample) = dynamic_cast<Restricted_Boltzmann_machine::CRBM*>(this->RBM_0.get())->sample_h_given_v(training_data_dbl.row(j));
            
            for(int l = 0; l < this->architecture[1]; ++l)
            {
                sampled_data(j,l) = h_sample(l);
            }
        }
    }
    
    std::cout << '\n';
    
    // (PRE-)TRAIN SUBSEQUENT RBMs, LAYER-BY-LAYER, USING THE PREVIOUSLY SAMPLED DATA ...
    for(auto i = 0; i < this->RBM_hid.size(); ++i)
    {
        std::cout << "training hidden RBM " << i << " ..." << '\n';
        
        // (pre-)train using the sampled (or resampled) data ...
        this->RBM_hid[i].train(k, sampled_data, lr, max_epochs, epoch_out);
        
        // if further RBMs exist, (re-)sample data ...
        if(i != (this->RBM_hid.size()-1))
        {
            Matrix<int> last_data = sampled_data;
            // note: recall that the architecture of the hidden RBMs is specified by (this->architecture[i+1], this->architecture[i+2])
            sampled_data = Matrix<int>(training_data.size(0), this->architecture[i+2]);
            
            for(auto j = 0; j < training_data.size(0); ++j)
            {
                Vector<double> h_mean;
                Vector<int> h_sample;
                std::tie(h_mean, h_sample) = this->RBM_hid[i].sample_h_given_v(last_data.row(j));
                
                for(int l = 0; l < this->architecture[i+2]; ++l)
                {
                    sampled_data(j,l) = h_sample(l);
                }
            }
        }
        
        std::cout << '\n';
    }
}


//========================================================================
//========================================================================
//
// NAME: template<typename T>
//       void neural_network::DBN_T<T>::finetune(const Matrix<T> &training_data_in,
//                                               const Matrix<double> &training_data_out,
//                                               const bool is_classif)
//
// DESC: Finetunes the DBN, by initializing a MLP with the same architecture and weights as the stacked RBMs, and using supervised backpropagation.
//
// INPUT:
//     Matrix<T>      training_data_in  : data matrix with the training inputs
//     Matrix<double> training_data_out : data matrix with the training targets (outputs)
//     bool           is_classif        : whether the task is classification
//
// OUTPUT: None ... the internal MLP is created, initialized from the stacked RBM weights, and trained.
//
//========================================================================
//========================================================================
template<typename T>
void neural_network::DBN_T<T>::finetune(const Matrix<T> &training_data_in,
                                        const Matrix<double> &training_data_out,
                                        const bool is_classif)
{
    // CREATE THE MLP ...
    std::vector<int> nhn(this->nhl);
    
    for(int i = 0; i < this->nhl; ++i)
    {
        nhn[i] = this->architecture[i+1];
    }
    
    // map the weights from the RMBs to a vector suitable for initializing the MLP
    std::vector<double> RBM_w;
    
    Matrix<double> RBM_W;
    Vector<double> RBM_h_bias;
    
    if(dynamic_cast<Restricted_Boltzmann_machine::RBM*>(this->RBM_0.get()))
    {
        RBM_W = dynamic_cast<Restricted_Boltzmann_machine::RBM*>(this->RBM_0.get())->get_W();
        RBM_h_bias = dynamic_cast<Restricted_Boltzmann_machine::RBM*>(this->RBM_0.get())->get_h_bias();
    }
    else if(dynamic_cast<Restricted_Boltzmann_machine::CRBM*>(this->RBM_0.get()))
    {
        RBM_W = dynamic_cast<Restricted_Boltzmann_machine::CRBM*>(this->RBM_0.get())->get_W();
        RBM_h_bias = dynamic_cast<Restricted_Boltzmann_machine::CRBM*>(this->RBM_0.get())->get_h_bias();
    }
    
    for(int j = 0; j < RBM_W.size(1); ++j)
    {
        for(int i = 0; i < RBM_W.size(0); ++i)
        {
            RBM_w.push_back(RBM_W(i,j));
        }
        
        RBM_w.push_back(RBM_h_bias(j));
    }
    
    // perform the mapping for each stacked RBM
    for(auto i = 0; i < this->RBM_hid.size(); ++i)
    {
        RBM_W = this->RBM_hid[i].get_W();
        RBM_h_bias = this->RBM_hid[i].get_h_bias();
        
        for(int j = 0; j < RBM_W.size(1); ++j)
        {
            for(int k = 0; k < RBM_W.size(0); ++k)
            {
                RBM_w.push_back(RBM_W(k,j));
            }
            
            RBM_w.push_back(RBM_h_bias(j));
        }
    }
    
    // now add the output nodes, and initialize 
    int ntop_RBM_nodes = this->architecture[this->architecture.size()-2];
    
    // note: the +1 here and below is for the bias node
    double alpha = std::sqrt( 3.0/static_cast<double>(ntop_RBM_nodes+1) );
    
    for(int i = 0; i < this->no; ++i)
    {
        for(int j = 0; j < (ntop_RBM_nodes+1); ++j)
        {
            RBM_w.push_back( rand_num_uniform_Mersenne_twister(-alpha, alpha) );
        }
    }
    
    this->MLP.create_MLP(this->ni, this->no, this->nhl, nhn, false, false, is_classif, RBM_w);
    
    // transform the datasets in Matrix<> form to DataSet ...
    DataSet data_set(training_data_in.size(0), 1, this->ni, this->no);
    
    for(auto i = 0; i < training_data_in.size(0); ++i)
    {
        for(auto j = 0; j < training_data_in.size(1); ++j)
        {
            data_set.pt[i].in[0][j] = static_cast<double>(training_data_in(i,j));
        }
        
        for(auto j = 0; j < training_data_out.size(1); ++j)
        {
            data_set.pt[i].out[j] = training_data_out(i,j);
        }
    }
    
    std::vector<DataSet> data_sets = partition_data_set(data_set);
    
    int method   = 0;     // backpropagation
    int itranf   = 0;     // scaling only (should have been scaled prior to pretraining though)
    double S_tol = 1.0;   // 
    bool silent  = false; //
    
    this->MLP.train(data_sets[0], data_sets[1], data_sets[2], method, itranf, S_tol, silent);
}


//========================================================================
//========================================================================
//
// NAME: Vector<double> neural_network::DBN_T<T>::predict(const Vector<double> &input)
//
// DESC: Predicts the output of the MLP for a given input. 
//
// INPUT:
//     Vector<double> input  : 
//
// OUTPUT:
//     Vector<double> output :
//
//========================================================================
//========================================================================
template<typename T>
Vector<double> neural_network::DBN_T<T>::predict(const Vector<double> &input)
{
    std::vector<std::vector<double>> MLP_input(1, std::vector<double>(input.size()));
    MLP_input[0] = input.std_vector();
    
    std::vector<double> MLP_output;
    this->MLP.evaluate(MLP_input, MLP_output);
    
    return Vector<double>(MLP_output);
}


//========================================================================
//========================================================================
//
// NAME: template<typename T>
//       std::vector<Vector<double>> neural_network::DBN_T<T>::get_features(const Vector<double> &input)
//
// DESC: Extracts the per-layer features (input, hidden, and output activations) from the DBN 
//
// INPUT:
//     Vector<double> input  :
//
// OUTPUT:
//     Vector<double> output :
//
// NOTES:
//     - This is a fragile subroutine, since it simply mimicks how the MLP is setup in jNEURALNET (hence, if that changes, this breaks)
//
//========================================================================
//========================================================================
template<typename T>
std::vector<Vector<double>> neural_network::DBN_T<T>::get_features(const Vector<double> &input)
{
    // << jmm: we don't need to add the input and output features the way that is done below, we could directly use input and MLP_output ... it is just a debug test >>
    
    std::vector<Vector<double>> features;
    
    // start by getting the output, as this also activates ALL neurons
    Vector<double> MLP_output = this->predict(input);
 
    int cnt = 0;
    
    // get the input features (which is just the input)
    Vector<double> ni_features(this->ni);
    
    for(int i = 0; i < this->ni; ++i)
    {
        ni_features(i) = this->MLP.m_neurons[cnt].m_output.front();
        
        ++cnt;
    }
    
    features.push_back(ni_features);
    
    // note: we need to add the bias neuron here
    ++cnt;
    
    // get the features of each hidden layer
    for(int i = 0; i < this->nhl; ++i)
    {
        Vector<double> nhl_features(this->architecture[i+1]);
        
        for(int j = 0; j < this->architecture[i+1]; ++j)
        {
            nhl_features(j) = this->MLP.m_neurons[cnt].m_output.front();
            
            ++cnt;
        }
        
        features.push_back(nhl_features);
    }
    
    // get the features of the output layer
    Vector<double> no_features(this->no);
    
    for(int i = 0; i < this->no; ++i)
    {
        no_features(i) = this->MLP.m_neurons[cnt].m_output.front();
        
        ++cnt;
    }
    
    features.push_back(no_features);
    
    // << jmm: temporarily debug check that the input and output is correct >>
    
    
    for(int i = 0; i < this->ni; ++i)
    {
        if(features.front()(i) != input(i))
        {
            std::cout << "error in neural_network::DBN_T<T>::get_features(): input is incorrect" << '\n';
        }
    }
    
    for(int i = 0; i < this->no; ++i)
    {
        if(features.back()(i) != MLP_output(i))
        {
            std::cout << "error in neural_network::DBN_T<T>::get_features(): output is incorrect" << '\n';
        }
    }
    
    return features;
}




// void   evaluate(const std::vector<std::vector<double>> &input, std::vector<double> &output);

*/